The purpose of this markdown is to explore the relationship between TOC and snowmelt signals within the CLP, with the intention of creating a baseline forecast model to measure further model development against.

This script is an adapted version of baseline_TOC.Rmd that includes data from the City of Fort Collins, including their historical data going back to 2008.

Harmonization of ROSS and FC data

# grab chemistry data - this is from the zenodo pull
# (one row per sample; renamed to lowercase keys used throughout this script)
ross_toc <- read_csv(here("data/upper_clp_dss/ross_clp_chem/data/cleaned/CLP_chemistry_up_to_20250701.csv")) %>% 
  rename(site_name = Site,
         date = Date) %>% 
  # keep IDs, organic carbon, and spatial metadata; the redundant
  # `site_name = site_name` / `date = date` renames are dropped
  select(site_code, site_name, date,
         TOC, DOC, distance_upstream_km, Lat:location_type, -watershed) %>% 
  mutate(collector = "ROSS")

# grab distance upstream (plus coordinates/campaign metadata) for FC samples;
# distinct() both selects and de-duplicates in one step
dist_upstream <- ross_toc %>% 
  distinct(site_code, distance_upstream_km, Lat, Long, Campaign, location_type)

# and TOC from FC's historical (2008-2021) workbook; non-numeric TOC entries
# coerce to NA and are dropped
FC_chem_2018_2021 <- read_xlsx(here("data/upper_clp_dss/fc_clp_chem/UCLP_CWQMP_database_2008-2021.xlsx")) %>% 
  mutate(date = ymd(Date),
         TOC = as.numeric(TOC)) %>% 
  select(date, 
         site_name = LongDesc,
         site_code = ShortDesc, TOC) %>% 
  filter(!is.na(TOC)) %>% 
  mutate(collector = "FC")

# FC's 2022-2024 data; this is the only table carrying SiteType, which feeds
# the site-type lookup below
FC_chem_2022_2024 <- read_csv(here("data/upper_clp_dss/fc_clp_chem/UCLP_CWQMP_database_2022-24.csv")) %>%
  # mdy() parses "m/d/y" straight to Date, avoiding the local-timezone
  # ambiguity of as.POSIXct() and matching the Date class of the 2008-2021 table
  mutate(date = mdy(Date)) %>%
  select(date, 
         location_type = SiteType,
         site_name = LongDesc,
         site_code = ShortDesc, TOC) %>% 
  filter(!is.na(TOC)) %>% 
  mutate(collector = "FC")

# site_code -> location_type lookup, used to attach a site type to the 2025
# TOC data (which lacks one)
FC_site_helper <- distinct(FC_chem_2022_2024, location_type, site_code)

# 2025 FC TOC data; path is now wrapped in here() like every other read in
# this script so it resolves from the project root, not the knit directory
FC_chem_2025 <- read_xlsx(here("data/upper_clp_dss/fc_clp_chem/UCLP_CWQMP_TOC_202504-202506.xlsx")) %>%
  select(date = `Sampled Date`, 
         site_name = `Location Name`,
         TOC = `Corrected Result`) %>%
  # flag non-detects while TOC is still character; as.numeric() then turns
  # "ND" into NA (that is the expected coercion warning)
  mutate(flag = if_else(TOC == "ND", "ND", ""),
         site_code = str_sub(site_name, 1, 3),
         TOC = as.numeric(TOC)) %>%
  mutate(collector = "FC") %>% 
  # explicit key documents intent and silences the join-by message
  left_join(FC_site_helper, by = "site_code")
# warning okay (as.numeric coerces "ND" to NA; those rows are flagged above)

# stack the three FC tables, normalize date to Date, and attach upstream
# distance / coordinates from the ROSS metadata
FC_toc <- reduce(list(FC_chem_2018_2021, FC_chem_2022_2024, FC_chem_2025), full_join) %>% 
  mutate(date = as_date(date)) %>% 
  select(-location_type) %>% 
  # site_code is the only shared column, so the explicit key matches the
  # previous implicit join exactly
  # NOTE(review): if dist_upstream ever carries >1 row per site_code this
  # join will fan out FC rows -- worth confirming
  left_join(dist_upstream, by = "site_code")

# harmonized ROSS + FC chemistry table used for the rest of the script
toc <- full_join(ross_toc, FC_toc)

Exploring DOC/TOC interoperability

Look at relationship between TOC and DOC, does this get us some “more” data that we can use DOC “as” TOC.

# TOC vs DOC scatter, one panel per site
toc %>% 
  filter(!is.na(TOC), !is.na(DOC)) %>% 
  ggplot(aes(x = TOC, y = DOC)) + 
  geom_point() + 
  facet_wrap(site_name ~ .)

The answer is, kind of yes. Exceptions: basically everything after Canyon Mouth. Let’s also look at water source type for this:

# same TOC vs DOC relationship, faceted by water-source type instead of site
toc %>% 
  filter(!is.na(TOC), !is.na(DOC)) %>% 
  ggplot(aes(x = TOC, y = DOC)) + 
  geom_point() + 
  facet_wrap(location_type ~ .)

Let’s look at mainstem here:

# mainstem-only TOC vs DOC, split by sampling campaign
toc %>% 
  filter(!is.na(TOC), !is.na(DOC), location_type == "Mainstem") %>% 
  ggplot(aes(x = TOC, y = DOC)) + 
  geom_point() + 
  facet_wrap(Campaign ~ .)

Okay, this aligns with the relationship falling apart after the Canyon Mouth.

Let’s also look at the stream type:

# stream-type sites only, split by campaign
toc %>% 
  filter(!is.na(TOC), !is.na(DOC), location_type == "Stream") %>% 
  ggplot(aes(x = TOC, y = DOC)) + 
  geom_point() + 
  facet_wrap(Campaign ~ .)

Let’s also look at the South Fork

# South Fork campaign only, one panel per site
toc %>% 
  filter(!is.na(TOC), !is.na(DOC), Campaign == "South Fork") %>% 
  ggplot(aes(x = TOC, y = DOC)) + 
  geom_point() + 
  facet_wrap(site_name ~ .)

Okay, these are all right-on - the correlation is something like 0.97, the intercept is nearly zero and the slope is nearly 1, so we’re just going to say that for these sites, DOC = TOC.

# Upper-campaign + South Fork sites, colored by year
toc %>% 
  filter(!is.na(TOC), !is.na(DOC)) %>% 
  filter(grepl("Upper", Campaign) | Campaign == "South Fork") %>% 
  ggplot(aes(x = TOC, y = DOC, color = factor(year(date)))) + 
  geom_point() + 
  facet_wrap(site_name + location_type ~ .)

Let’s look at DOC timeseries across the upper sites and South Fork:

# DOC day-of-year timeseries for the upper sites and South Fork;
# panels are labeled "distance - site name" so they sort spatially
toc %>% 
  filter(!is.na(DOC), 
         grepl("Upper", Campaign) | Campaign == "South Fork") %>% 
  mutate(dist_name = paste(distance_upstream_km, site_name, sep = " - ")) %>% 
  ggplot(aes(x = yday(date), y = DOC, color = factor(year(date)))) + 
  geom_point() + 
  facet_wrap(dist_name + location_type ~.)

And removing post-fire 2021, 2022, since we’ll assume that DOC is not as dominant of a part of TOC during this time, or at the very least might be a different proportion of TOC during this recovery time. (We only have DOC-TOC matches post-fire.) Additionally, limiting DOC interpretation to those that have 2015 data.

# DOC timeseries excluding immediate post-fire years, limited to the five
# sites that also have 2015 data; facets use an explicit site ordering
toc %>% 
  filter(!year(date) %in% c(2021, 2022),
         site_code %in% c("PSF", "PBR", "SFM", "PNF", "PJW"),
         !is.na(DOC)) %>% 
  mutate(ordered_site = factor(site_code, 
                               levels = c("PNF", "PSF", "PBR", "SFM", "PJW"))) %>% 
  ggplot(aes(x = yday(date), y = DOC, color = factor(year(date)))) + 
  geom_point() + 
  facet_wrap(ordered_site + distance_upstream_km + Campaign ~.)

The consistency between 2015 and post-fire leads me to believe that there is probably consistency in the relationship between DOC and TOC in 2015 data and post-fire data, since it doesn’t seem significantly larger or smaller than pre-fire. We’re assuming that these water years were pretty similar in this assessment.

For now, we’re going to limit even further to a handful of sites with good data records. The sites here are those that are on the mainstem and not as affected by reservoir signals. In particular, we drop JWC since it is primarily reservoir fed.

# DOC timeseries for the eight good-record mainstem/South Fork sites
# (JWC excluded as reservoir-fed); the unused dist_name column has been
# removed -- ordered_name drives the facet layout
toc %>% 
  filter(site_code %in% c("PBD", "PNF", "PMAN", "PSF", 
                          "PBR", "SFM",  "PFAL", "PJW") &
           !year(date) %in% c(2021, 2022), !is.na(DOC)) %>% 
  mutate(ordered_name = factor(site_name, levels = c("Poudre at Bellvue Diversion",
                                                     "Poudre Above North Fork",
                                                     "Poudre at Manner's Bridge",
                                                     "Poudre Below S. Fork", 
                                                     "Poudre Below Rustic",
                                                     "Poudre South Fork",
                                                     "Poudre below Poudre Falls",
                                                     "Poudre Above Joe Wright"))) %>% 
  ggplot(., aes(x = yday(date), y = DOC, color = factor(year(date)))) + 
  geom_point() + 
  facet_wrap(ordered_name + distance_upstream_km +location_type + Campaign ~ .)

ROSS x FC

We’re just shooting for the mainstem forecast right now. After some perusing, NF is… weird… and definitely does not vibe with the mainstem. Do we need to remove the late-season High Park Fire 2013 signal? (It is filtered out below.)

# combined organic-carbon timeseries by collector, one panel per year
toc %>% 
  # prefer measured TOC, fall back to DOC where TOC is missing
  mutate(toc_doc = coalesce(TOC, DOC)) %>% 
  filter(site_code %in% c("PBD", "PNF", "PMAN", "PSF", 
                          "PBR", "SFM",  "PFAL", "PJW")) %>%
  filter(!year(date) %in% c(2021, 2022)) %>%
  filter(!(year(date) == 2013 & month(date) > 8)) %>%
  mutate(ordered_site = factor(site_code, levels = c("PBD", "PNF", "PMAN", "PSF", 
                                                     "PBR", "SFM",  "PFAL", "PJW"))) %>%
  ggplot(aes(x = yday(date), y = toc_doc, color = collector)) +
  geom_point() +
  facet_wrap(year(date) ~ .)

GAM time

Great, now let’s play with a GAM fit. For now, we won’t do a TVT (train–validate–test split), because I just want to see if it works. I have played with including a bunch of other sites, but this is the most straightforward group for forecasting the 4 mainstem sites.

# Build the modeling frame: eight mainstem/South Fork sites, DOY/year
# predictors, and a combined organic-carbon response (toc_doc).
gam_toc <- toc %>% 
  filter(site_code %in% c("PBD", "PNF", "PMAN", "PSF", 
                          "PBR", "SFM", "PFAL", "PJW")) %>% 
  mutate(DOY = yday(date),
         year = year(date), 
         # use TOC when measured, otherwise DOC -- leaning on the ~1:1
         # TOC-DOC relationship established above
         toc_doc = if_else(!is.na(TOC), TOC, DOC),
         # to play with categorical variables, they failed.
         # fire years: post-Cameron Peak (2021-22) and late-2013 High Park
         fire_signal = if_else(year %in% c(2021, 2022) | (year == 2013 & month(date) > 8), 1, 0),
         # there's one flier at SFM (> 9 mg/L); NA it out before filtering
         toc_doc = if_else(site_code == "SFM" & toc_doc > 9, NA_real_, toc_doc)) %>% 
  # limit time (DOY 100-320), remove fire signal, remove flier
  filter(!is.na(toc_doc), 
         between(DOY, 100, 320), 
         fire_signal == 0)

# see what this looks like: toc_doc as a tensor-product smooth of day-of-year
# and distance upstream (6x6 basis); gamma = 2 penalizes wiggliness beyond
# the REML default to curb overfitting
TOC_GAM <- gam(toc_doc ~ te(DOY, distance_upstream_km, k = c(6, 6)),
               method = "REML", gamma = 2,
               data = gam_toc)

# diagnostics: convergence, model rank, and basis-dimension (k) checks
gam.check(TOC_GAM)

## 
## Method: REML   Optimizer: outer newton
## full convergence after 6 iterations.
## Gradient range [-1.78983e-06,1.714016e-07]
## (score 1018.275 & scale 1.629889).
## Hessian positive definite, eigenvalue range [0.7045673,288.7644].
## Model rank =  36 / 36 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                                k'  edf k-index p-value    
## te(DOY,distance_upstream_km) 35.0 27.5    0.87  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 2-D smooth as a contour/heatmap; too.far = 1 keeps all regions visible
plot(TOC_GAM, scheme = 2, too.far = 1, shade = TRUE)

# same surface as a 3-D perspective view over DOY x distance
vis.gam(TOC_GAM, view = c("DOY", "distance_upstream_km"),
        plot.type = "persp", theta = 30, phi = 30,
        ticktype = "detailed")

Okay, this seems to work pretty well, even if it’s a little wonky and seemingly overfit, especially where we don’t have data ~60km upstream. We do, however, want to do some thoughtful TVT here to keep our error estimates honest. First let’s look at annual distribution since we want to test in 2025.

# sample counts per year (.by grouping keeps first-appearance row order,
# which is why the printed table below is unsorted)
gam_toc %>% 
  summarize(n = n(), 
            .by = year)
## # A tibble: 16 × 2
##     year     n
##    <dbl> <int>
##  1  2014    60
##  2  2015    82
##  3  2023   139
##  4  2024   215
##  5  2025    97
##  6  2008    56
##  7  2009    63
##  8  2010    55
##  9  2011    61
## 10  2012    56
## 11  2013    41
## 12  2016    50
## 13  2017    55
## 14  2018    50
## 15  2019    50
## 16  2020    32

I think we can do some tvt across years here, keeping 2020, 2023-2024 always in the training set.

GAM diagnostics are interpreted using the gam.check() function. We want to make sure that:

  1. the model converged (second line of the output), the convergence value is very small, and ideally the model rank is full (e.g., 36/36 here)
  2. We are confident we are not finding a local minimum in the convergence (“Hessian positive definite”)
  3. that the k-index is ~1 and the p-value > 0.05
  4. we want the lowest k (default is 25 or c(5, 5)) while not maxing out the wiggles which is determined by k’ (max number of wiggles) and edf (actual wiggles used)

Some notes on interpretation and tuning:

My rule of thumb for making sure you don’t overfit: if the edf value of your model is less than the k’ of a lower k, you might be overfit:

edf = 20, k’ = 48 (implying the k = c(7,7), or 7x7-1), you’d want to try k = c(6,6), which would have a k’ of 35 (6x6-1), and ideally the edf would still be low with a smaller parameterization space.

Added: With the added data, we’re in a strange place with the k-index/p-value. It’s okay to have a significant p (p < 0.05) as long as the k-index is near 1 and we’ve optimized the other parameters of the fit summary. I’ve found that with the added data I can counteract the significant p by adding a gamma value (this helps guard against overfitting) - 1 is the default; increasing to 1.5-2 can help a ton.

Make CV Models

Make a function for easier iteration:

# Fit one cross-validation fold: hold out `block` years for validation,
# train on everything else (2025 is always reserved for the final test set),
# print diagnostics/plots, and return list(fit, performance_ggplot).
#
# block     - numeric vector of years to hold out
# k_vals    - basis dimensions passed to te(); previously this argument was
#             accepted but IGNORED (the fit hard-coded k = c(6, 6)) -- now
#             actually wired through. Callers all pass c(6, 6), so behavior
#             is unchanged for existing calls.
# gamma_val - smoothing penalty multiplier (1 = mgcv default)
run_cv_gam <- function(block, k_vals, gamma_val = 1) {
  valid <- gam_toc %>% filter(year %in% block)
  # by = "year" removes exactly the held-out years (same rows the implicit
  # all-column join removed, stated explicitly)
  train <- anti_join(gam_toc, valid, by = "year") %>% 
    filter(year != 2025) 
  
  # fit gam
  fit <- gam(toc_doc ~ te(DOY, distance_upstream_km, k = k_vals), 
             data = train, method = "REML", select = TRUE, gamma = gamma_val)
  
  # check gam diagnostics
  gam.check(fit)
  
  plot(fit, scheme = 2, too.far = 1, shade = TRUE)
  
  vis.gam(fit,
          view = c("DOY", "distance_upstream_km"),
          plot.type = "persp",
          theta = 30, phi = 30,
          ticktype = "detailed")
  
  # apply to validation
  valid$pred <- predict(fit, newdata = valid)
  
  # and report performance
  mae_val <- mae(valid$toc_doc, valid$pred)
  rmse_val <- rmse(valid$toc_doc, valid$pred)
  bias_val <- bias(valid$toc_doc, valid$pred)
  
  # r2 from linear regression of observed on predicted
  r2_val <- summary(lm(valid$toc_doc ~ valid$pred))$r.squared
  
  # format annotation text for plot
  metrics_text <- paste0(
    "MAE = ", round(mae_val, 2),
    "\nRMSE = ", round(rmse_val, 2),
    "\nBias = ", round(bias_val, 2),
    "\nR² = ", round(r2_val, 2)
  )
  
  # residual diagnostics against each predictor
  plot(valid$DOY, valid$toc_doc - valid$pred)
  plot(valid$distance_upstream_km, valid$toc_doc - valid$pred)
  
  valid_performance <- ggplot(valid, aes(y = toc_doc, x = pred)) +
    geom_point() +
    geom_abline(slope = 1, intercept = 0, color = "red") +
    theme_bw() +
    annotate("text", x = min(valid$pred), y = max(valid$toc_doc), 
             hjust = 0, vjust = 1, 
             label = metrics_text, size = 4) +
    labs(title = paste("Predicted vs Observed - Years:", paste0(block, collapse = ", ")))
  
  # (a bare `valid_performance` here was a no-op inside a function -- removed;
  # callers retrieve the plot from the returned list)
  return(list(fit, valid_performance))
}

CV Model 1

cv_1 <- run_cv_gam(block = c(2008, 2009), k = c(6,6), gamma = 1.7)

## 
## Method: REML   Optimizer: outer newton
## full convergence after 7 iterations.
## Gradient range [-2.260611e-06,4.237299e-06]
## (score 967.8873 & scale 1.639017).
## Hessian positive definite, eigenvalue range [0.2778896,277.8538].
## Model rank =  36 / 36 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                                k'  edf k-index p-value    
## te(DOY,distance_upstream_km) 35.0 18.6    0.88  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

CV Model 2

cv_2 <- run_cv_gam(block = c(2010, 2011), k = c(6,6), gamma = 1.7)

## 
## Method: REML   Optimizer: outer newton
## full convergence after 8 iterations.
## Gradient range [-2.091342e-05,1.851489e-05]
## (score 943.2304 & scale 1.475329).
## Hessian positive definite, eigenvalue range [0.4381471,278.7428].
## Model rank =  36 / 36 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                                k'  edf k-index p-value   
## te(DOY,distance_upstream_km) 35.0 19.1    0.93   0.005 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

CV Model 3

cv_3 <- run_cv_gam(block = c(2012, 2013), k = c(6, 6), gamma = 1.7)

## 
## Method: REML   Optimizer: outer newton
## full convergence after 7 iterations.
## Gradient range [-0.0001165118,0.0001037072]
## (score 974.5264 & scale 1.547977).
## Hessian positive definite, eigenvalue range [0.2266671,284.3286].
## Model rank =  36 / 36 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                                k'  edf k-index p-value    
## te(DOY,distance_upstream_km) 35.0 19.1    0.77  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

CV Model 4

cv_4 <- run_cv_gam(block = c(2014, 2015), k = c(6,6), gamma = 1.5)

## 
## Method: REML   Optimizer: outer newton
## full convergence after 7 iterations.
## Gradient range [-0.0002180607,0.0004181635]
## (score 1078.705 & scale 1.707866).
## Hessian positive definite, eigenvalue range [0.4183872,307.2816].
## Model rank =  36 / 36 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                                k'  edf k-index p-value    
## te(DOY,distance_upstream_km) 35.0 19.2    0.87  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

CV Model 5

cv_5 <- run_cv_gam(block = c(2016, 2017), k = c(6,6), gamma = 1.5)

## 
## Method: REML   Optimizer: outer newton
## full convergence after 7 iterations.
## Gradient range [-2.223133e-06,1.606759e-05]
## (score 1081.886 & scale 1.518902).
## Hessian positive definite, eigenvalue range [0.4146953,319.5977].
## Model rank =  36 / 36 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                                k'  edf k-index p-value    
## te(DOY,distance_upstream_km) 35.0 18.1    0.91  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

CV Model 6

cv_6 <- run_cv_gam(block = c(2018, 2019), k = c(6,6), gamma = 1.5)

## 
## Method: REML   Optimizer: outer newton
## full convergence after 7 iterations.
## Gradient range [-2.079404e-05,1.322841e-05]
## (score 1122.325 & scale 1.688355).
## Hessian positive definite, eigenvalue range [0.4334171,321.2747].
## Model rank =  36 / 36 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                              k' edf k-index p-value    
## te(DOY,distance_upstream_km) 35  19     0.9  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Apply ensemble to test set

# 2025 is the held-out test year
test_2025 <- gam_toc %>% filter(year == 2025)

# apply ensemble: one prediction per CV fold, then the simple mean
# (predict() ignores the extra pred columns accumulating on test_2025)
test_2025$pred1 <- predict(cv_1[[1]], newdata = test_2025)
test_2025$pred2 <- predict(cv_2[[1]], newdata = test_2025)
test_2025$pred3 <- predict(cv_3[[1]], newdata = test_2025)
test_2025$pred4 <- predict(cv_4[[1]], newdata = test_2025)
test_2025$pred5 <- predict(cv_5[[1]], newdata = test_2025)
test_2025$pred6 <- predict(cv_6[[1]], newdata = test_2025)
test_2025$pred_avg <- (test_2025$pred1 + test_2025$pred2 + test_2025$pred3 +
                         test_2025$pred4 + test_2025$pred5 + test_2025$pred6) / 6

# calculate ensemble performance metrics on the full 2025 test set
mae_val  <- mae(test_2025$toc_doc, test_2025$pred_avg)
rmse_val <- rmse(test_2025$toc_doc, test_2025$pred_avg)
bias_val <- bias(test_2025$toc_doc, test_2025$pred_avg)

# r2 from linear regression of observed on predicted
r2_val <- summary(lm(test_2025$toc_doc ~test_2025$pred_avg))$r.squared

# format annotation text for plot
metrics_text <- paste0(
  "MAE = ", round(mae_val, 2),
  "\nRMSE = ", round(rmse_val, 2),
  "\nBias = ", round(bias_val, 2),
  "\nR² = ", round(r2_val, 2)
)


# pred - obs scatter plot! (dashed red line is the 1:1)
ggplot(test_2025, aes(x = toc_doc, y = pred_avg)) +
  geom_point(color = "blue", alpha = 0.6, size = 2) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  labs(x = "Observed TOC (mg/L)",
       y = "Predicted TOC (ensemble, mg/L)",
       title = "GAM Ensemble Predictions vs Observed (2025)") +
  annotate("text", x = 3, y = 8, 
           hjust = 0, vjust = 1, 
           label = metrics_text, size = 4) +
  theme_bw(base_size = 14) 

# uncomment on new model
# ggsave(here(paste0("modeling/toc/toc_forecast/performance/baseline_wFC_2025_v", version_date, ".jpg")),
#        width = 8, height = 8, units = "in")

# for kicks, look at performance by site
ggplot(test_2025, aes(x = toc_doc, y = pred_avg)) +
  geom_point(color = "blue", alpha = 0.6, size = 2) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  labs(x = "Observed TOC (mg/L)",
       y = "Predicted TOC (ensemble, mg/L)",
       title = "GAM Ensemble Predictions vs Observed (2025)") +
  theme_bw(base_size = 14) +
  coord_cartesian(xlim = c(3, 8), ylim = c(3, 8)) +
  facet_wrap(site_code ~ .)

And let’s look at performance if we only pull out the four sites that we care about for forecasting:

# restrict to the four mainstem sites targeted by the forecast
test_2025_four <- test_2025 %>% 
  filter(site_code %in% c("PFAL", "PBR", "PMAN", "PBD"))
# calculate performance metrics
mae_val  <- mae(test_2025_four$toc_doc, test_2025_four$pred_avg)
rmse_val <- rmse(test_2025_four$toc_doc, test_2025_four$pred_avg)
bias_val <- bias(test_2025_four$toc_doc, test_2025_four$pred_avg)

# r2 from linear regression of observed on predicted
r2_val <- summary(lm(test_2025_four$toc_doc ~test_2025_four$pred_avg))$r.squared

# format annotation text for plot
metrics_text <- paste0(
  "MAE = ", round(mae_val, 2),
  "\nRMSE = ", round(rmse_val, 2),
  "\nBias = ", round(bias_val, 2),
  "\nR² = ", round(r2_val, 2)
)


# pred - obs scatter plot, colored by data collector
ggplot(test_2025_four, aes(x = toc_doc, y = pred_avg, color = collector)) +
  geom_point(alpha = 0.6, size = 2) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  labs(x = "Observed TOC (mg/L)",
       y = "Predicted TOC (ensemble, mg/L)",
       title = "GAM Ensemble Predictions vs Observed (2025)") +
  annotate("text", x = 3, y = 8, 
           hjust = 0, vjust = 1, 
           label = metrics_text, size = 4) +
  theme_bw(base_size = 14) +
  coord_cartesian(xlim = c(3, 8), ylim = c(3, 8))

# uncomment on new model
# ggsave(here(paste0("modeling/toc/toc_forecast/performance/baseline_wFC_focus_sites_2025_v", version_date, ".jpg")),
#        width = 8, height = 8, units = "in")

# for kicks, look at performance by site
# (x was previously mapped to DOC, which contradicted the "Observed TOC"
# axis label and silently dropped FC rows that carry TOC only; use the
# combined toc_doc column like the other performance plots)
ggplot(test_2025_four, aes(x = toc_doc, y = pred_avg)) +
  geom_point(color = "blue", alpha = 0.6, size = 2) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  labs(x = "Observed TOC (mg/L)",
       y = "Predicted TOC (ensemble, mg/L)",
       title = "GAM Ensemble Predictions vs Observed (2025)") +
  theme_bw(base_size = 14) +
  coord_cartesian(xlim = c(3, 8), ylim = c(3, 8)) +
  facet_wrap(site_code ~ .)

And because we think there is something funky with FC data in 2025, we’ll also do some performance without those data

# rebuild the four-site subset, this time excluding FC-collected samples
# (overwrites the earlier test_2025_four)
test_2025_four <- test_2025 %>% 
  filter(site_code %in% c("PFAL", "PBR", "PMAN", "PBD"),
         collector != "FC")
# calculate performance metrics
mae_val  <- mae(test_2025_four$toc_doc, test_2025_four$pred_avg)
rmse_val <- rmse(test_2025_four$toc_doc, test_2025_four$pred_avg)
bias_val <- bias(test_2025_four$toc_doc, test_2025_four$pred_avg)

# r2 from linear regression of observed on predicted
r2_val <- summary(lm(test_2025_four$toc_doc ~test_2025_four$pred_avg))$r.squared

# format annotation text for plot
metrics_text <- paste0(
  "MAE = ", round(mae_val, 2),
  "\nRMSE = ", round(rmse_val, 2),
  "\nBias = ", round(bias_val, 2),
  "\nR² = ", round(r2_val, 2)
)


# pred - obs scatter plot (all points are ROSS-collected after the filter)
ggplot(test_2025_four, aes(x = toc_doc, y = pred_avg, color = collector)) +
  geom_point(alpha = 0.6, size = 2) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  labs(x = "Observed TOC (mg/L)",
       y = "Predicted TOC (ensemble, mg/L)",
       title = "GAM Ensemble Predictions vs Observed (2025)") +
  annotate("text", x = 3, y = 8, 
           hjust = 0, vjust = 1, 
           label = metrics_text, size = 4) +
  theme_bw(base_size = 14) +
  coord_cartesian(xlim = c(3, 8), ylim = c(3, 8))

# uncomment on new model
# ggsave(here(paste0("modeling/toc/toc_forecast/performance/baseline_noFC_focus_sites_2025_v", version_date, ".jpg")),
#        width = 8, height = 8, units = "in")

Visualize the decision space for interpretability

GAMs are pretty interpretable when they have fewer interacting variables. In this case, we can just create a synthetic space/time space and apply the model across it to visualize what happens up the canyon over time.

# Make a prediction grid across DOY and distance upstream
# (seq() default step of 1 -> one cell per day x per km-unit of distance)
newdata <- expand.grid(
  DOY = seq(min(gam_toc$DOY), max(gam_toc$DOY)),
  distance_upstream_km = seq(min(gam_toc$distance_upstream_km),
                             max(gam_toc$distance_upstream_km))
)

# Get predictions from each of the six CV fits over the grid; extra fit
# columns accumulating on newdata are ignored by predict()
for (i in seq_len(6)) {
  cv_model <- get(paste0("cv_", i))[[1]]
  newdata[[paste0("fit", i)]] <- predict(cv_model, newdata = newdata)
}

# create ensemble prediction (simple mean of the six folds)
newdata <- newdata %>% 
  mutate(ave_fit = (fit1 + fit2 + fit3 + fit4 + fit5 + fit6) / 6)

# heatmap + contour plot of the ensemble surface over space (distance) and
# time (day of year)
ggplot(newdata, aes(x = DOY, y = distance_upstream_km, fill = ave_fit)) +
  geom_tile() +
  geom_contour(aes(z = ave_fit), color = "white", alpha = 0.7) +
  scale_fill_viridis_c(option = "plasma") +
  labs(fill = "Predicted TOC",
    x = "Day of Year",
    y = "Distance upstream from Canyon Mouth (km)") +
  theme_bw(base_size = 14) 

# uncomment on new model
# ggsave(here(paste0("modeling/toc/toc_forecast/performance/baseline_wFC_visualize_model_v", version_date, ".jpg")),
#        width = 8, height = 8, units = "in")

Export Models and Metrics

And let’s save these models as the baseline and export the performance info for the CV and test sets. Only run this section if you want to save the models and export statistics.

# write_rds(cv_1[[1]], here(paste0("data/upper_clp_dss/modeling/TOC_baseline/TOC_GAM_fit1_v", version_date, ".rds")))
# write_rds(cv_2[[1]], here(paste0("data/upper_clp_dss/modeling/TOC_baseline/TOC_GAM_fit2_v", version_date, ".rds")))
# write_rds(cv_3[[1]], here(paste0("data/upper_clp_dss/modeling/TOC_baseline/TOC_GAM_fit3_v", version_date, ".rds")))
# write_rds(cv_4[[1]], here(paste0("data/upper_clp_dss/modeling/TOC_baseline/TOC_GAM_fit4_v", version_date, ".rds")))
# write_rds(cv_5[[1]], here(paste0("data/upper_clp_dss/modeling/TOC_baseline/TOC_GAM_fit5_v", version_date, ".rds")))
# write_rds(cv_6[[1]], here(paste0("data/upper_clp_dss/modeling/TOC_baseline/TOC_GAM_fit6_v", version_date, ".rds")))
# 
# ggsave(plot = cv_1[[2]], 
#        filename = here(paste0("modeling/toc/toc_forecast/performance/cv_1_performance_v", version_date, ".jpg")),
#        width = 8, height = 8, units = "in")
# ggsave(plot = cv_2[[2]], 
#        filename = here(paste0("modeling/toc/toc_forecast/performance/cv_2_performance_v", version_date, ".jpg")),
#        width = 8, height = 8, units = "in")
# ggsave(plot = cv_3[[2]],
#        filename = here(paste0("modeling/toc/toc_forecast/performance/cv_3_performance_v", version_date, ".jpg")),
#        width = 8, height = 8, units = "in")
# ggsave(plot = cv_4[[2]],
#        filename = here(paste0("modeling/toc/toc_forecast/performance/cv_4_performance_v", version_date, ".jpg")),
#        width = 8, height = 8, units = "in")
# ggsave(plot = cv_5[[2]],
#        filename = here(paste0("modeling/toc/toc_forecast/performance/cv_5_performance_v", version_date, ".jpg")),
#        width = 8, height = 8, units = "in")
# ggsave(plot = cv_6[[2]],
#        filename = here(paste0("modeling/toc/toc_forecast/performance/cv_6_performance_v", version_date, ".jpg")),
#        width = 8, height = 8, units = "in")